/*******************************************************************************

Name: Table 6 - hetergeneity across demographics and political affiliations

Authors: Jonathan Baker, Lori Bennear, Sheila Olmstead

Input files:
	water systems panel: waterSystems.dta
	2003 census data (for fips codes): est03ALL.xls
	state abbreviations: stateabbreviations.xlsx
	pwsid county matching (from GRPA tables): pwsidCountiesFY[2010-2013].xlsx
	1990 census: 90census.dta
	presidential election results: preselections_countyresults.dta
	Primary panel: natlCWS_yr_Panel_mod-small_v2-Match.dta
		
*******************************************************************************/


* ========================== START START START ==========================
clear
clear matrix
set more off

* ______________________________________________________________________________
* set paths and create log file
global root "C:/Users/jbaker/Documents/Research/SDWIS/Analysis/STATA"
global input "$root/Input"
global intermediate "$root/Intermediate"
global log "$root/LogFiles"
global output "$root/Output"
global outpanel "$root/OutputPanel"
global figures "$root/Figures"

log using "$log/Table6_heterogeneity.log", replace name(analysis)

* ============================================================================== 

* Step 1: prepare demographic and election results to merge into main panel

* ==============================================================================

*_______________________________________________________________________________
*** LOAD COUNTY-PWSID DATA (from 2013 GPRA tables)
 /* I generated these tables by double clicking on value in the "total" row 
    for each fiscal year */

foreach year of numlist 2010/2013 {
display in red "------------------------------------"
display in red "Importing `year' data"
import excel using "$input/pwsidCountiesFY`year'.xlsx", sheet("fy`year'") ///
	firstrow case(lower) clear
duplicates report
duplicates drop
keep pwsid pwsname pcounty pwstype state
duplicates report pwsid
sort pwsid 
save "$intermediate/TEMP-fy`year'.dta", replace
}

* check that state assignments are correct
foreach year of numlist 2010/2013 {
use "$intermediate/TEMP-fy`year'.dta", clear
display in red "---- TEMP-fy`year' ----"
	gen checkstate = substr(pwsid,1,2)
	gen flag = 0
	replace flag = 1 if checkstate != state
	tab flag
	drop flag checkstate
	}

/* merge county information across the four fiscal years

   Method: only replace a county assignment if it is blank and the subsequent
	   year's county assignment is populated */

use "$intermediate/TEMP-fy2010.dta", clear
rename pcounty pcounty2010

sort pwsid
merge 1:1 pwsid using "$intermediate/TEMP-fy2011.dta"
	gen checkcnty = 0
	replace checkcnty = 1 if pcounty2010 == "" & pcounty != ""
	replace pcounty2010 = pcounty if checkcnty == 1
	drop pcounty checkcnty _merge

sort pwsid
merge 1:1 pwsid using "$intermediate/TEMP-fy2012.dta"
	gen checkcnty = 0
	replace checkcnty = 1 if pcounty2010 == "" & pcounty != ""
	replace pcounty2010 = pcounty if checkcnty == 1
	drop pcounty checkcnty _merge

sort pwsid
merge 1:1 pwsid using "$intermediate/TEMP-fy2013.dta"
	gen checkcnty = 0
	replace checkcnty = 1 if pcounty2010 == "" & pcounty != ""
	replace pcounty2010 = pcounty if checkcnty == 1
	drop pcounty checkcnty _merge

sort pwsid
rename pcounty2010 pcounty
save "$intermediate/intermediatepanel.dta", replace

*_______________________________________________________________________________
* DEMOGRAPHICS DATA
use "$intermediate/intermediatepanel.dta", clear

duplicates report
duplicates report pwsid
sort pwsid
save "$intermediate/TEMP-pwsidCounty.dta", replace

*** merge counties data file with waterSystems.dta
  // purpose: associate county names with water systems used in analysis

use "$input/waterSystems.dta", clear
sort pwsid
duplicates report pwsid

merge 1:1 pwsid using "$intermediate/TEMP-pwsidCounty.dta"
sort pwsid pcounty

// keep matched community water system run by states 
// (use CWS definition from water systems data; i.e. pws_type_code)
keep if pws_type_code == "CWS" & primacy_type == "State" & _merge == 3
drop if pcounty == "" 
keep pwsid state pcounty pws_type_code

duplicates report
duplicates report pwsid
duplicates report state pcounty
sort state pcounty pwsid
save "$intermediate/TEMP-countyNames.dta", replace
	
*** LOAD COUNTY-CENSUS INFORMATION

/* data source notes:
data source:
https://www.census.gov/did/www/saipe/data/statecounty/data/index.html
(1998 link)

download date:
1/24/2017

data legend:
https://www.census.gov/did/www/saipe/data/statecounty/data/1998.html

using 2003 for now. 2003 is the earliest I can get an excel spreadsheet 
with an easily importable format
https://www.census.gov/did/www/saipe/data/statecounty/data/2003.html 

*/

// import raw data (deriving FIPS codes from 2003)
import excel using "$input/est03ALL.xls", ///
	cellrange(A2:AE3195) firstrow case(lower) allstring clear

// keep fips codes, state abbreviations, and county name
keep statefips countyfips postalcode name
rename postalcode state

// concatenate fips codes
replace statefips = "0" + statefips if strlen(statefips) == 1
replace countyfips = "00" + countyfips if strlen(countyfips) == 1
replace countyfips = "0" + countyfips if strlen(countyfips) == 2

// drop US and State only fips
drop if countyfips == "000"

// create 5-digit fips code
gen fips = statefips + countyfips
duplicates report state name
duplicates report fips

// modify county name variable
gen pcounty = name
replace pcounty = subinstr(pcounty," County","",1)
replace pcounty = subinstr(pcounty," Parish","",1)
replace pcounty = subinstr(pcounty," Borough","",1)
replace pcounty = subinstr(pcounty," Census Area","",1)
replace pcounty = strtrim(pcounty)
compress

// rename county names for matching with pwsidCounty data file
replace pcounty = "LaSalle" if state == "IL" & countyfips == "099"
replace pcounty = "DeKalb" if state == "IN" & countyfips == "033"
replace pcounty = "LaGrange" if state == "IN" & countyfips == "087"
replace pcounty = "LaPorte" if state == "IN" & countyfips == "091"
replace pcounty = "De Baca" if state == "NM" & countyfips == "011"
replace pcounty = "McKean" if state == "PA" & countyfips == "083"

// merge fips onto pwsids
duplicates report state pcounty
sort state pcounty
merge 1:m state pcounty using "$intermediate/TEMP-countyNames.dta"
	keep if _merge == 3
	sort pwsid
	drop _merge

keep pwsid fips

// save file
duplicates report pwsid
sort pwsid fips
save "$intermediate/TEMP-pwsidFIPS.dta", replace

*** MERGE 1990 census data

use "$input/90census.dta", clear
gen fips = state_fip + county_fip
sort fips
duplicates report fips
merge 1:m fips using "$intermediate/TEMP-pwsidFIPS.dta"
	list pwsid fips if _merge == 2
	tab _merge

keep if _merge==3
drop _merge state_fip county_fip
order pwsid mean_fam perc_* fips
duplicates report pwsid
sort pwsid

save "$intermediate/pwsid90census.dta", replace

*_______________________________________________________________________________
* ELECTIONS DATA

use "$intermediate/intermediatepanel.dta", clear
replace pcounty = strupper(pcounty)

duplicates report
duplicates report pwsid
sort pwsid

replace pcounty = "LA SALLE" if pcounty == "LASALLE" & st == "IL"
replace pcounty = "BALTIMORE CITY" if pcounty == "BALTIMORE" & st == "MD"
replace pcounty = "PRINCE GEORGES" if pcounty == "PRINCE GEORGE'S" & st == "MD"
replace pcounty = "QUEEN ANNES" if pcounty == "QUEEN ANNE'S" & st == "MD"
replace pcounty = "ST. MARYS" if pcounty == "ST. MARY'S" & st == "MD"
replace pcounty = "ST. LOUIS CITY" if pcounty == "ST. LOUIS" & st == "MO"
replace pcounty = "DE SOTO" if pcounty == "DESOTO" & st == "MS"
replace pcounty = "CARSON CITY" if pwsid == "NV0000015"
	// assuming that carson city public works serves carson city
replace pcounty = "DE WITT" if pcounty == "DEWITT" & st == "TX"

save "$intermediate/pwsidCounty.dta", replace
gen test = 0
replace test = 1 if pcounty == ""
tab state if test == 1
tab state if test == 1 & pwstype == "CWS"

// import and clean pres election data and merge with pwsid to county mapping

* match state abbreviations to state name
import excel using "$input/stateabbreviations.xlsx", firstrow clear
duplicates report
duplicates report state
sort state
save "$intermediate/stateabbreviations.dta", replace

* match pwsid values to the states in presidential elections
use "$input/preselections_countyresults.dta", clear
	tab _merge
	drop _merge
	sort state area racedate
	merge m:1 state using "$intermediate/stateabbreviations.dta"
	tab _merge
	drop _merge
	order statefips countyfips Office state st
	duplicates report st area racedate
	sort st area racedate
	save "$intermediate/preselections_countyresults_st.dta", replace

use "$intermediate/preselections_countyresults_st.dta", clear
keep st area
duplicates drop
rename area pcounty
rename st state
duplicates report state pcounty
sort state pcounty
save "$intermediate/TEMP-stcountyelec.dta", replace

use "$intermediate/pwsidCounty.dta", clear
duplicates report pwsid
sort pwsid state pcounty
	merge m:1 state pcounty using "$intermediate/TEMP-stcountyelec.dta"
	tab _merge
	tab state if _merge == 1 & pwstype == "CWS" // AK not in election data
	tab state if _merge == 2
	keep if _merge == 3
	drop _merge

* match pwsids back into state election data
gen racedate = "19921103"
gen raceyear = 1992
expand 3
bysort pwsid: gen cnt = _n
replace racedate = "19961105" if cnt == 2
replace raceyear = 1996 if cnt == 2
replace racedate = "20001107" if cnt == 3
replace raceyear = 2000 if cnt == 3
drop cnt
rename state st
rename pcounty area
order pwsid st area racedate raceyear
sort pwsid st area racedate
duplicates report pwsid st area racedate // no duplicates
	merge m:1 st area racedate using ///
		"$intermediate/preselections_countyresults_st.dta"
	tab st if _merge == 2
	keep if _merge == 3
	drop _merge

* drop some unnecessary variables
sort pwsid st area racedate
drop *notes *fips racedate censuspop redistricteddate
drop pwstype st area areaname state summarylevel

* destring numeric variables
destring(*percent), replace
local votevar "totalvotes repvotes demvotes thirdvotes othervotes pluralityvotes"
foreach v of local votevar {
	replace `v' = subinstr(`v', ",", "", .)
	replace `v' = subinstr(`v', "N/A", "", .)
	}
destring(*votes), replace
order pwsid raceyear *votes *percent
/* 
   for the few cases where the plurality party is not dem or rep, I assign the
   "winner" based on rep or dem vote share. Some elections had an equal 50/50 
   split; in these cases I follow the pluralityparty assignment from the 
   election data (and in all those cases, the plurality part is either R or D)
*/
gen Dwins = 0
	replace Dwins = 1 if demvotesmajorpercent > 50
	replace Dwins = 1 if demvotesmajorpercent == 50 & pluralityparty == "D" 
tab Dwins pluralityparty

* expand to match 1990 to 2001 panel size, assigning raceyears to appropriate
* calendar years
*	raceyear 1992: 1990-1995
*	raceyear 1996: 1996-1999
*	raceyear 2000: 2000-2001

expand 6 if raceyear == 1992
expand 4 if raceyear == 1996
expand 2 if raceyear == 2000

sort pwsid raceyear

* assign calendar years to each observation for matching onto panel
by pwsid: gen yrcnt = _n
gen year = 1989
replace year = year + yrcnt
order pwsid year raceyear
drop yrcnt
sort pwsid year
duplicates report pwsid year
compress
save "$intermediate/pwsid_panel_pres_elections.dta", replace

* ============================================================================== 

* Step 2: Table 6 results for demographic indicators

* ==============================================================================
use "$outpanel/natlCWS_yr_Panel_mod-small_v2-Match.dta", clear
xtset pwsidNUM year

// merge census data with water violations panel
sort pwsid year
merge m:1 pwsid using "$intermediate/pwsid90census.dta"
	tab _merge
	// about 14% of counties don't match
	keep if _merge == 3
	drop _merge
	sort pwsidNUM year

// create demographic interaction terms
local demograph "mean_family_inc perc_white perc_black perc_hs_ed perc_some_college perc_college"
local dthreshold "501 10k 100k"
foreach j of local demograph {
	gen Postx`j' = Post * `j'
	foreach i of local dthreshold {
		gen PostxT_`i'x`j' = PostxT_`i' * `j'
	}
}
	
local DV "HLTH_999"
display "Dependent variable is set to `DV'"
local threshold "501" // 10k 100k"
local id "id2001"
local demograph "mean_family_inc perc_white perc_black perc_hs_ed perc_some_college perc_college"

foreach req of local threshold {
	display "Table 6 (1) ------------------------------------------"
	reghdfe `DV' PostxT_`req' Postxsize Postxsize2 ///
	if `id'==1 & mflag`req'==1 [fweight=freq_`req'], ///
	a(i.pwsidNUM i.state#i.year) vce(cl pwsidNUM)
	display "Model: `e(cmdline)'"
	estimates save "$output/sysFEstyrFEfsize2-`id'-`req'-demogrph.ster", replace
	
	local tick = 1
	foreach j of local demograph {
		local tick = `tick' + 1
		display "Table 6 (`tick') ------------------------------------------"
		reghdfe `DV' PostxT_`req'x`j' PostxT_`req' Postx`j' ///
			Postxsize Postxsize2 ///
			if `id'==1 & mflag`req'==1 [fweight=freq_`req'], ///
			a(i.pwsidNUM i.state#i.year) vce(cl pwsidNUM)
		display "Model: `e(cmdline)'"
		estimates save "$output/sysFEstyrFEfsize2-`id'-`req'-demogrph`j'.ster", replace
	}
}

* ============================================================================== 

* Step 3: Table 6 results for political indicator

* ==============================================================================

use "$outpanel/natlCWS_yr_Panel_mod-small_v2-Match.dta", clear
sort pwsid year
duplicates report pwsid year
		merge 1:1 pwsid year using "$intermediate/pwsid_panel_pres_elections.dta"
		tab year if _merge == 1
		tab _merge if id2001 == 1
		keep if _merge == 3
		sort pwsidNUM year
		isid pwsidNUM year
		save "$intermediate/temp-panel.dta", replace

use "$intermediate/temp-panel.dta", replace
xtset pwsidNUM year
rename repvotestotalpercent rtp
rename demvotestotalpercent dtp
rename repvotesmajorpercent rmp
rename demvotesmajorpercent dmp

// create election results interaction terms
local elec "Dwins rtp dtp rmp dmp"
local dthreshold "501 10k 100k"
foreach j of local elec {
	gen Postx`j' = Post * `j'
	foreach i of local dthreshold {
		gen PostxT_`i'x`j' = PostxT_`i' * `j'
		gen T_`i'x`j' = T_`i' * `j'
	}
}	

local DV "HLTH_999"
display "Dependent variable is set to `DV'"
local threshold "501" // 10k 100k"
local id "id2001"
local elec "Dwins" /// rtp dtp rmp dmp"

foreach req of local threshold {
	display "(not reported) ------------------------------------------"
	reghdfe `DV' PostxT_`req' Postxsize Postxsize2 ///
		if `id'==1 & mflag`req'==1 [fweight=freq_`req'], ///
		a(i.pwsidNUM i.state#i.year) vce(cl pwsidNUM)
	display "Model: `e(cmdline)'"
	estimates save ///
	"$output/sysFEstyrFEfsize2-`id'-`req'-elec.ster", replace
	
	local tick = 1
	foreach j of local elec {
	display "Table 6 (8) ------------------------------------------"
		reghdfe `DV' PostxT_`req'x`j' PostxT_`req' T_`req'x`j' ///
			Postx`j' `j' Postxsize Postxsize2 ///
			if `id'==1 & mflag`req'==1 [fweight=freq_`req'], ///
			a(i.pwsidNUM i.state#i.year) vce(cl pwsidNUM)
		display "Model: `e(cmdline)'"
		estimates save ///
		"$output/sysFEstyrFEfsize2-`id'-`req'-elec`j'.ster", replace
	}
}
* ========================== DONE DONE DONE ==========================
log close analysis